********************************************************************************
* MASTER DO-FILE
* Project: The Effects of Mergers & Acquisitions on Workers and Firms in Canada
*
* Purpose : This master script runs all do-files in the correct order.
*           It defines global directory paths and Stata settings, then
*           executes each pipeline stage: data cleaning, matching, and analysis.
*
* Data sources (Statistics Canada, accessed through the VRDC):
*   T4ROE  - Employer-employee earnings records (T4 slips + Record of Employment)
*   T1     - Individual income tax returns (age, sex, UI earnings)
*   NALMF  - National Accounts Longitudinal Microdata File (firm-level accounts)
*   T2s50  - Corporate shareholder data (T2 Schedule 50, ownership panel)
*   SDC Platinum M&A database linked to NALMF firm identifiers
*
* Sample period  : 2001-2017
* Matching window: Firms 2004-2016 | Workers 2005-2016
*
* Pipeline overview:
*   Step 1  Clean Data      -> build core analytical datasets
*   Step 2  Matching        -> propensity-score match treated/control units
*   Step 3  Main Analysis   -> Tables 1-6 and B6; Figures 1, 2, and B6
*   Step 4  Appendix A      -> robustness checks
*   Step 5  Appendix B      -> heterogeneity analysis
********************************************************************************

*------------------------------------------------------------------------------
* DIRECTORY GLOBALS
* Adjust these paths to match your VRDC project folder structure.
*------------------------------------------------------------------------------
* Output folder: tables and figures
global out "Z:\VRDC-PROJ-6730\Moon_6730\output"

* Processed / intermediate datasets
global data "Z:\VRDC-PROJ-6730\Moon_6730\data"

* Raw M&A-NALMF linked file
global data0 "Z:\VRDC-PROJ-6730\PROJECT_6730"

* Raw T4ROE, NALMF, and T1 data
global data1 "Z:\VRDC-PROJ-6730\PROJECT_6730_10138"

* Supplementary firm/worker variables
global data2 "Z:\VRDC-PROJ-6730\PROJECT_6730_10138\NewFiles\Project_6730_20211026"

* Geographic identifiers (CZ)
global data3 "Z:\VRDC-PROJ-6730\PROJECT_6730_10138\NewFiles\Project_6730_20220222\to_be_transferred"

* Location of this code folder
global code "Z:\VRDC-PROJ-6730\Moon_6730\codes\replication_package"

* Working directory for the session; relative paths used further down in this
* file (e.g. the "ado" folder) are resolved against it.
cd "Z:\VRDC-PROJ-6730\Moon_6730"

*------------------------------------------------------------------------------
* STATA SETTINGS
*------------------------------------------------------------------------------
* Start from an empty session before changing any settings
clear all
set more off             // suppress --more-- prompts in long output
set matsize 1100         // increase maximum matrix size for large regressions
set maxvar 120000        // allow a large number of variables
set emptycells drop      // drop empty factor-variable cells automatically
set scheme s2color       // default graph color scheme
set rmsg on              // display run-time messages for each command

* Point Stata to user-written ado-files stored on the VRDC server.
* The "ado" folder sits in the working directory set by the cd command above.
* It is registered as an absolute path so that ado lookups keep resolving even
* if a later do-file changes the working directory.
local adodir "`c(pwd)'/ado"
sysdir set PLUS "`adodir'"

* Load the Mata source file required by the gtools package (fast Stata commands).
* It lives in the "_" subfolder of the PLUS directory.
run "`adodir'/_/_gtools_internal.mata"

*------------------------------------------------------------------------------
* USER-WRITTEN PROGRAMS
* Defines make_eventstudy, a wrapper around coefplot used in analysis do-files.
* The path is quoted so that the code folder may contain spaces.
*------------------------------------------------------------------------------
do "${code}/functions.do"

********************************************************************************
* REQUIRED STATA PACKAGES
* Install once (requires internet access outside VRDC, or a local ado folder).
* To install, run the commands listed inside the comment block below.
********************************************************************************

/*
ssc install egenmore    // extended egen functions
* fillin is an official Stata command ([D] fillin); no SSC installation needed
ssc install missings    // utilities for missing values
ssc install ftools      // fast factor-variable tools (required by reghdfe)
ssc install gtools      // fast alternatives to collapse, egen, etc.
gtools, upgrade         // update gtools to the latest release
ssc install coefplot    // coefficient and event-study plots
ssc install reghdfe     // two-way fixed effect estimation
ssc install lincomest   // compute linear combinations of coefficients
ssc install group2hdfe  // connected groups for two-way fixed effects
ssc install psmatch2    // propensity-score matching (used in Step 2)
*/
********************************************************************************
* STEP 1: DATA CLEANING
* Build the core analytical datasets from raw administrative records.
********************************************************************************

* All do-file paths in this step are quoted so that the code folder may
* contain spaces.

* 1a. Construct the M&A deal-level dataset.
*     Source : SDC Platinum deals linked to NALMF firm identifiers.
*     Keeps completed deals; drops buybacks and self-deals.
*     Outputs: all_mna.dta         (all deals, firm-deal level)
*              first_mna.dta       (first M&A event per firm)
*              repeat_acquirer.dta (count of deals per acquirer)
do "${code}/clean_mna.do"

* 1b. Build the worker-firm panel, moonlighting indicator, and firm employment panel.
*     Source : T4ROE records (2001-2017), all jobs per worker-year.
*     Outputs: worker_firm_panel.dta  (worker x firm x year panel)
*              moonlighter.dta        (workers with simultaneous jobs)
*              emp_panel.dta          (firm x year employment and payroll)
do "${code}/worker_firm_panel.do"

* 1c. Create the firm-level annual dataset for propensity-score matching.
*     Source : NALMF firm accounts, merged with M&A status and AKM/match effects.
*     Outputs: firm_YYYY.dta      (one file per year, 2001-2017)
*              firm_level_emp.dta (firm x year employment, used in AKM)
*     NOTE(review): the AKM and match effects are estimated in steps 1e-1f,
*     which run after this step. Confirm whether clean_firm.do really merges
*     them (and then needs a second pass) or whether this description is stale.
do "${code}/clean_firm.do"

* 1d. Create a panel of all M&A firms (acquirers and targets) over time.
*     Used for Figure 1 (deal count graph).
*     Output: NALMF_mna.dta
do "${code}/clean_mna_firm.do"

* 1e. Estimate firm (employer) fixed effects using the AKM two-way FE model.
*     Restricts to mobile workers in the largest connected set.
*     Output: akm.dta (one firm fixed effect per firm)
do "${code}/akm_estimation.do"

* 1f. Estimate worker-firm match effects using Woodcock's decomposition.
*     Controls for tenure; uses reghdfe to absorb worker and firm FEs.
*     Output: match_effect.dta (one match effect per worker-firm pair)
do "${code}/match_effect_estimation.do"

* 1g. Create the worker-level annual dataset for matching.
*     Merges T4ROE worker records with firm characteristics, geographic
*     information (CZ), AKM effects, and match effects.
*     Also constructs auxiliary datasets:
*       - Firm-level count of commuting zones (CZs)
*       - Indicator for deals with overlapping product markets (CZ x industry)
*       - Indicator for firms that share a market with their M&A counterpart
*       - Panel of "treated markets" (markets containing an M&A firm)
*     Outputs: worker_YYYY.dta, firm_year_cz.dta, same_market.dta,
*              same_market_firm.dta, treated_market.dta
do "${code}/clean_worker.do"

* 1h. Construct market concentration measures at the CZ x industry level.
*     Builds two measures:
*       - Standard HHI based on employment shares
*       - Generalized (flow-adjusted) HHI using worker mobility weights (GHHI)
*     Outputs: concentration.dta, ghhi.dta
do "${code}/concentration.do"

********************************************************************************
* STEP 2: PROPENSITY SCORE MATCHING
* Match M&A firms and workers to comparable non-M&A controls via PSM.
* Matching is done within cells defined by sector, province, and covariate bins.
* psmatch2 is used with a caliper of 1 and 1:1 nearest-neighbor matching.
********************************************************************************

* All do-file paths in this step are quoted so that the code folder may
* contain spaces.

* 2a. Baseline firm matching: covariates are revenue, age, and average wage.
*     Outputs: firm_matched_list.dta (matched pair IDs)
*              firm_matched.dta      (full event-study panel of matched firms)
do "${code}/matching_firm.do"

* 2b. Baseline worker matching: covariate is age; cells include sex and sector.
*     Outputs: worker_matched_list.dta
*              worker_matched.dta (full event-study panel of matched workers)
do "${code}/matching_worker.do"

* 2c. Robustness: add worker tenure as an extra matching covariate.
*     Output: worker_matched_w_tenure.dta
do "${code}/matching_worker_w_tenure.do"

* 2d. Robustness: add within-firm wage distribution percentile as a matching variable.
*     Output: worker_matched_w_wage.dta
do "${code}/matching_worker_w_wage.do"

* 2e. Cross-market robustness: require that matched firm pairs operate in
*     different geographic markets (no market overlap between treated and control).
*     Outputs: firm_matched_diff_market.dta, worker_matched_diff_market.dta
do "${code}/matching_firm_diff_market.do"
do "${code}/matching_worker_diff_market.do"

* 2f. Robustness: include firm Return on Assets (ROA) as an additional
*     firm-level matching variable.
*     Outputs: firm_matched_w_roa.dta, worker_matched_w_roa.dta
do "${code}/matching_firm_w_roa.do"
do "${code}/matching_worker_w_roa.do"

* 2g. Construct the unmatched eligible sample (units that passed selection
*     criteria but may not have been matched). Used for balance-table comparisons.
*     Outputs: firm_eligible_list.dta, worker_eligible_list.dta
do "${code}/matching_firm_eligible.do"
do "${code}/matching_worker_eligible.do"

* 2h. Build the owners' capital gains dataset from T2 Schedule 50 shareholder data.
*     Merges shareholder records with firm-matched and eligible-firm lists.
*     Outputs: firm_owner.dta, eligible_firm_owner.dta
do "${code}/clean_cap_gain.do"


********************************************************************************
* STEP 3: MAIN ANALYSIS
********************************************************************************

* All do-file paths in this step are quoted so that the code folder may
* contain spaces.

* Table 1    : Descriptive statistics (matched sample)
* Table A9   : Pre-matching balance table (firms)
* Table A9   : Pre-matching balance table (workers)
*              NOTE(review): both balance tables are labelled A9 here; confirm
*              whether the worker table carries a different number.
* Figure 1   : Number of M&A events
* Figure A11 : Probability of exit and transition to unemployment
* Figure A12 : Propensity-score matching overlap
do "${code}/descriptive_stats.do"

* Table 2  : Firm-level event-study results (employment, payroll, profits, ROA)
*            Figures 2A-H: Event-study plots for firm outcomes
* Table B6 : Firm-level results including revenue, markups, realized capital gains
*            Figures B6A-H: Event-study plots for extended firm outcomes
do "${code}/analyze_main.do"

* Tables 3-6 and corresponding figures: Worker-level event-study results
*   Table 3: Wages and earnings
*   Table 4: Job mobility (moving rates, sector transitions)
*   Table 5: Within-firm wage distribution
*   Table 6: Heterogeneity by worker type
do "${code}/analyze_worker.do"


********************************************************************************
* STEP 4: APPENDIX A - ROBUSTNESS CHECKS
********************************************************************************

* All Appendix A tables (robustness: alternative samples, specifications,
* concentration measures, etc.) except the Appendix A items produced by
* descriptive_stats.do in Step 3.
* NOTE(review): this comment previously named "Table A9, Figures A9 and A11",
* while the Step 3 list names Table A9 and Figures A11 and A12; confirm the
* figure numbers against the paper.
* The path is quoted so that the code folder may contain spaces.
do "${code}/analyze_robust.do"


********************************************************************************
* STEP 5: APPENDIX B - HETEROGENEITY ANALYSIS
********************************************************************************

* All Appendix B tables (heterogeneity by market overlap, deal type,
* worker characteristics, etc.) except Table B6, which is produced by
* analyze_main.do in Step 3.
* The path is quoted so that the code folder may contain spaces.
do "${code}/analyze_hetero.do"

